## Daniel J. Mallinson and Darrell Lovell
## Reproduction Script for "Pencils Down. for Good? 
## The Expansion of Test-optional Policy after COVID-19"
## 2023-08-28

####################### Script for Reproducing Data Assembly ##################

## Access Urban Institute API
# https://github.com/UrbanInstitute/education-data-package-r 
# https://educationdata.urban.org/documentation/#how_to_use
# Codebook: https://educationdata.urban.org/documentation/colleges.html

install.packages('devtools')
devtools::install_github('UrbanInstitute/education-data-package-r')

library(educationdata)
library(reshape2)
library(psych)
library(zoo)
library(multicon)
library(dplyr)
library(data.table)
library(tidyverse)

## Tuition
ys = 2004:2019

tuition = data.table::fread("https://educationdata.urban.org/csv/ipeds/colleges_ipeds_ay_tuition_fees.csv")%>%
  filter(
    level_of_study==1,
    tuition_type==3,
    year%in%ys)%>%
  select(unitid, year, tuition_type, tuition_fees_ft)

tuition <- as.data.frame(tuition)

#Pell Grants
years = 2004:2017
pell =data.table::fread("https://educationdata.urban.org/csv/ipeds/colleges_ipeds_finance.csv")%>%
  filter(year%in%years)%>%
  select("unitid", "year", "sch_pell_grant", "sch_total_student_aid")

pell <- as.data.frame(pell)

#Enrollment
ys = 2004:2018

enroll = data.table::fread("https://educationdata.urban.org/csv/ipeds/colleges_ipeds_admissions-enrollment.csv")%>%
  filter(
    year%in%ys,
    sex==99)%>%
  select("unitid", "year", "number_applied", "number_admitted", "number_enrolled_ft")

enroll <- as.data.frame(enroll)

#Directory
ys = 2004:2019

directory = data.table::fread("https://educationdata.urban.org/csv/ipeds/colleges_ipeds_directory.csv")%>%
  filter(
    year%in%ys)%>%
  select("unitid", "year", "inst_name", 
         "address", "state_abbr", "zip","offering_highest_level",
         "region", "inst_control", "institution_level", 
         "sector", "fips", "hbcu")

directory <- as.data.frame(directory)

#Enrollment FTE
ys = 2004:2018

enroll = data.table::fread("https://educationdata.urban.org/csv/ipeds/colleges_ipeds_admissions-enrollment.csv")%>%
  filter(
    year%in%ys,
    sex==99)%>%
  select("unitid", "year", "number_applied", "number_admitted", "number_enrolled_ft")

enroll <- as.data.frame(enroll)

#Grad Rates
years <- c(2004:2017)
grad_rate =data.table::fread("https://educationdata.urban.org/csv/ipeds/colleges_ipeds_grad-rates.csv") 
grad_rate_2 = grad_rate%>%
  filter(year%in%years,
         subcohort == 99,
         sex == 99,
         race%in%c(1,2,3,99))%>%
  select("unitid", "year", "race", "completion_rate_150pct")%>%
  group_by(unitid,year)%>%
  spread(race,completion_rate_150pct)%>%
  rename(white_gradrate = `1`,
         black_gradrate = `2`,
         hispanic_gradrate = `3`,
         total_gradrate = `99`)

grad_rate_2 <- as.data.frame(grad_rate_2)

#Race (Fall Enrollment)
years <- 2004:2018
data_list = vector(mode = "list", length = length(years))
for(i in 1:length(years)){
  data_list[[i]] = as.data.frame(data.table::fread(paste0("https://educationdata.urban.org/csv/ipeds/colleges_ipeds_fall-enrollment-race_",years[i],".csv")))%>%
    filter(degree_seeking==1,
           class_level==99,
           sex==99,
           ftpt==1,
           race%in%c(1,2,3,99))%>%
    group_by(unitid, year)%>%
    spread(race,enrollment_fall)
}
enrollment = bind_rows(data_list[1:length(years)])%>%
  rename(white_enrollment = `1`,
         black_enrollment = `2`,
         hispanic_enrollment = `3`,
         total_enrollment = `99`)%>%
  select(-c(sex,ftpt,class_level,degree_seeking, level_of_study))

enrollment <- as.data.frame(enrollment)

## SAT and ACT Scores
ys = 2004:2018
testscores = data.table::fread("https://educationdata.urban.org/csv/ipeds/colleges_ipeds_admissions-requirements.csv")%>%
  filter(
    year%in%ys)%>%
  select("year", "unitid", "open_admissions_policy","sat_crit_read_75_pctl", "sat_math_75_pctl", "act_composite_75_pctl", "sat_percent_submitting", "act_percent_submitting")
testscores <- as.data.frame(testscores)

testscores$sat_percent_submitting[is.na(testscores$sat_percent_submitting)] <- 0
testscores$act_percent_submitting[is.na(testscores$act_percent_submitting)] <- 0
testscores$sat_crit_read_75_pctl[is.na(testscores$sat_crit_read_75_pctl)] <- 0
testscores$sat_math_75_pctl[is.na(testscores$sat_math_75_pctl)] <- 0
testscores$act_composite_75_pctl[is.na(testscores$act_composite_75_pctl)] <- 0

#set aside open enrollment
open <- testscores[c("year", "unitid", "open_admissions_policy")]
open <- open[which(open$year==2018),]
testscores$open_admissions_policy <- NULL

## Geocoding Variables
ys = 2004:2017
geo = data.table::fread("https://educationdata.urban.org/csv/nhgis/colleges_nhgis_geog_2010.csv")%>%
  filter(
    year%in%ys)%>%
  select("year", "unitid", "geo_longitude", "geo_latitude")
geo <- as.data.frame(geo)

## Merge data

data <- merge(directory, tuition, by=c("unitid", "year"), all.x=TRUE, all.y=TRUE)
data <- merge(data, geo, by=c("unitid", "year"), all.x=TRUE, all.y=TRUE)
data <- merge(data, pell, by=c("unitid", "year"), all.x=TRUE, all.y=TRUE)
data <- merge(data, enroll, by=c("unitid", "year"), all.x=TRUE, all.y=TRUE)
data <- merge(data, grad_rate_2, by=c("unitid", "year"), all.x=TRUE, all.y=TRUE)
data <- merge(data, enrollment, by=c("unitid", "year"), all.x=TRUE, all.y=TRUE)
data <- merge(data, testscores, by=c("unitid", "year"), all.x=TRUE, all.y=TRUE)

## Drop Private For-Profit Institutions
forprofit <- unique(data$unitid[data$inst_control==3])

data <- data[!(data$unitid %in% forprofit),]

## Interpolate and Copy for NAs

data$check_2 = rowSums(!is.na(data[,15:36]))>0

data1 = data %>%
  group_by(unitid)%>%
  mutate(check_1 = n()==16)%>%
  filter(check_1 == T,
         check_2 == T)

for(i in 1:length(unique(data$unitid))){
  usedata <- data1[which(data1$unitid==unique(data1$unitid)[i]),]
  usedata <- rbind(usedata, usedata[nrow(usedata),])
  usedata$year[nrow(usedata)] = 2020
  if(i==1){
    data1_a <- usedata
  }else{
    data1_a <- rbind(data1_a, usedata)
  }
}
data1 = data1_a%>%
  mutate(across(15:36,~na.approx(.,method="linear",rule = 2, na.rm=F)))%>%
  select(-c(check_1,check_2))

schools <- read.csv("schools.csv") #Read in adopters

data1$right_censor <- 0
data1$right_censor[!(data1$unitid %in% unique(schools$unitid))] <- 1

adopters <- data1[which(data1$right_censor==0),]
censored <- data1[which(data1$right_censor==1),]

censored <- censored[which(censored$year==2020),]
censored$school <- censored$permenant <- censored$temp_year <- censored$GPA <- censored$blind <- NA
censored$left_censor <- 0

adopters <- merge(adopters, schools, by=c("unitid", "year"), all.x=FALSE, all.y=FALSE)

data2 <- rbind(adopters,censored)

# Add time counter for survival analysis

data2$time <- data2$year - 2003

#Remove schools with 0 and NA applications, admissions, tuition

data2 <- data2[which(data2$tuition_fees_ft > 0),]
data2 <- data2[which(data2$number_admitted > 0),]
data2 <- data2[which(data2$number_applied > 0),]
data2 <- data2[which(data2$number_enrolled_ft > 0),]

# Calculate selectivity

data2$selectivity <- data2$number_admitted/data2$number_applied*100

#Calculate Pell percentage
data2$pellpct <- data2$sch_pell_grant/data2$sch_total_student_aid*100

#Remove one pell error (>300 percent)
data2 <- data2[which(data2$unitid != 219170),]

# Remove Open Enrollment Schools

data2 <- data2[!(data2$unitid %in% open$unitid[open$open_admissions_policy==1]),]

# Create racial categories percentages
data2$whitepct <- data2$white_enrollment / data2$total_enrollment*100
data2$blackpct <- data2$black_enrollment / data2$total_enrollment*100
data2$hispanicpct <- data2$hispanic_enrollment / data2$total_enrollment*100

#Create indicator of COVID adoption
data2$covid <- 0
data2$covid[data2$year==2020 & data2$right_censor == 0] <- 1

#Create indicator of adoption anytime
data2$adopt <- 0
data2$adopt[data2$right_censor==0] <- 1

#Create labels for adopter categories
data2$label <- "Non-Adopter"
data2$label[data2$right_censor==0 & data2$covid==0] <- "Pre-COVID"
data2$label[data2$right_censor==0 & data2$covid==1] <- "COVID"

#Convert to percentages
data2$total_gradrate <- data2$total_gradrate*100

#Convert tuition to $1000s
data2$tuition_fees_ft <- data2$tuition_fees_ft/1000

#Create ACT to SAT concordance matrix
# https://www.r-bloggers.com/2012/01/act-to-sat-mv-concordance-chart-in-r/
act <- c((36:9),0)
sat <- c(1590, 1540, 1500,  1460, 1430, 1400, 1370, 1340, 1310, 1280, 1240,
         1210, 1180, 1140, 1100, 1080, 1040, 1010, 970, 930, 890, 850, 800, 760,
         710, 670, 630, 590, 0)
act2sat <- data.frame(act, sat)

#Convert ACT to SAT

#Create new SAT variable using most submitted test score

data2$testscore <- NA

for(i in 1:nrow(data2)){
  if(data2$act_percent_submitting[i]==0 & data2$sat_percent_submitting[i]==0){
    data2$testscore[i] <- NA
  }else{
    if(data2$act_percent_submitting[i] > data2$sat_percent_submitting[i]){
      data2$testscore[i] <- act2sat$sat[which(act2sat$act==data2$act_composite_75_pctl[i])]
    }else{
      data2$testscore[i] <- data2$sat_crit_read_75_pctl[i] + data2$sat_math_75_pctl[i] 
    }
  }
}

data2$testscore[which(data2$testscore==0)] <- NA

# Calculate tuition adjusted based on the Higher Education Price Index
#https://www.commonfund.org/higher-education-price-index

year <- c(2004:2020)
hepi.values <- c(231.7, 240.8, 253.1, 260.3, 273.2, 279.3, 281.8, 288.4, 293.2, 297.8, 
                 306.7, 312.9, 317.7, 327.4, 336.1, 346.0, 352.7)
hepi <- as.data.frame(cbind(year, hepi.values))

data2 <- merge(data2, hepi, by="year")
data2$tuition_real <- data2$tuition_fees_ft*352.7/data2$hepi.values #Calculated in 2020 dollars

data2$number_enrolled_ft <- data2$number_enrolled_ft/100

## Create intervals for left censoring

data2$int1 <- data2$time
data2$int1[data2$int1==1] <- NA

data2$int2 <- data2$time
data2$int2[data2$adopt==0 & data2$left_censor==0] <- NA

# Remove territories
data2 <- data2[!(data2$state_abbr %in% c("VI", "PR", "GU", "FM")),]

write.csv(data2, "test_optional_data_allschools_ihe.csv")

########################## Script for Reproducing Analysis ####################

library(ggpubr)
library(car)
library(survival)
library(SurvRegCensCov)
library(gridExtra)

data <- read.csv("test_optional_data_allschools_ihe.csv")

## Figure 1 - Map of adopters
#Packages for Geocoding
library(ggmap)
library(tmaptools)
library(RCurl)
library(jsonlite)
library(tidyverse)
library(leaflet)
library(translateR)
library(maps)
library(usmap)
library(rgdal)
library(dplyr)

TestOptional <- data[c("geo_longitude", "geo_latitude", "unitid", "year", "adopt", "covid", "right_censor")]
TestOptional <- na.omit(TestOptional)

#Address Geocoding 

#######################Delete Before Sharing#######################
#API KEY
#register_google(key = "") #Need to add Google key
#######################Delete Before Sharing#######################

#Mapping Results
TestOptionalGC_US <- usmap_transform(TestOptional)

#Pre-covid
TestOptionalGC_US_precov <- TestOptionalGC_US[which(TestOptionalGC_US$covid==0),]
TestOptionalGC_US_cov <- TestOptionalGC_US[which(TestOptionalGC_US$covid==1),]

plot_usmap() +
  geom_point(data = TestOptionalGC_US, aes(x = geo_longitude.1, y = geo_latitude.1),
             color = "black", alpha = 0.25) +
  labs(title = "Test Optional Higher Education Institutions") 

png("figure1.png", height=3, width=8, units="in", res=600)
a <- plot_usmap() +
  geom_point(data = TestOptionalGC_US_precov, aes(x = geo_longitude.1, y = geo_latitude.1),
             color = "black", alpha = 1, size=.75) +
  ggtitle("Before 2020")+
  theme(plot.title = element_text(hjust = 0.5, face="bold"))
  #labs(title = "Before 2020") 

b <- plot_usmap() +
  geom_point(data = TestOptionalGC_US_cov, aes(x = geo_longitude.1, y = geo_latitude.1),
             color = "red", alpha = 1, size=.75) +
  geom_point(data = TestOptionalGC_US_precov, aes(x = geo_longitude.1, y = geo_latitude.1),
             color = "black", alpha = 1, size=.75) +
  ggtitle("2020")+
  theme(plot.title = element_text(hjust = 0.5, face="bold"))
  #labs(title = "2020") 

grid.arrange(a,b,ncol=2)
dev.off()

## Table 1: Descriptive Statistics

#Sample
sample <- data[c("time", "adopt", "inst_control", "tuition_real", "selectivity", "number_enrolled_ft",
                 "blackpct", "hispanicpct", "pellpct", "total_gradrate", "covid", "testscore")]

#sample <- na.omit(sample)
#nrow(sample)

describe(sample[c("time", "adopt", "inst_control", "tuition_real", "selectivity", "number_enrolled_ft",
                "blackpct", "hispanicpct", "pellpct", "total_gradrate", "covid", "testscore")], skew=FALSE, na.rm=TRUE)

#Types of schools
table(data$inst_control)

##HBCU
nrow(data[which(data$hbcu==1),]) #61
nrow(data[which(data$hbcu==1 & data$right_censor==0 & data$covid==0),]) #31
31/61
nrow(data[which(data$hbcu==1 & data$right_censor==0 & data$covid==1),]) #19
19/61
50/61

## Table 2: ANOVA Results
## ANOVAs

## Race
# Plot weight by group
# Add error bars: mean_se
# (other values include: mean_sd, mean_ci, median_iqr, ....)

#Pct Black

ggline(data, x = "label", y = "blackpct", 
       add = c("mean_ci", "jitter"), 
       color="black",
       point.color="red",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Percent African American", xlab = "")

leveneTest(blackpct ~ label, data=data)
oneway.test(blackpct ~ label, data=data, var.equal=FALSE)

pairwise.t.test(data$blackpct, data$label, p.adjust.method="BH", pool.sd=FALSE)

aggregate(data$blackpct, list(data$label), FUN=mean, na.rm=TRUE, na.action=na.pass)

#Pct Hispanic
ggline(data, x = "label", y = "hispanicpct", 
       add = c("mean_ci", "jitter"), 
       color="black",
       point.color="red",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Percent Hispanic", xlab = "")

leveneTest(hispanicpct ~ label, data=data)
oneway.test(hispanicpct ~ label, data=data, var.equal=FALSE)

pairwise.t.test(data$hispanicpct, data$label, p.adjust.method="BH", pool.sd=FALSE)

aggregate(data$hispanicpct, list(data$label), FUN=mean, na.rm=TRUE, na.action=na.pass)

##Tuition
ggline(data, x = "label", y = "tuition_real", 
       add = c("mean_ci", "jitter"), 
       color="black",
       point.color="red",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Tuition", xlab = "")

leveneTest(tuition_real ~ label, data=data)
oneway.test(tuition_real ~ label, data=data, var.equal=FALSE)

pairwise.t.test(data$tuition_real, data$label, p.adjust.method="BH", pool.sd=FALSE)

aggregate(data$tuition_real, list(data$label), FUN=mean, na.rm=TRUE, na.action=na.pass)

##Pell Grants
ggline(data, x = "label", y = "pellpct", 
       add = c("mean_ci", "jitter"), 
       color="black",
       point.color="red",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Percent Pell Funding", xlab = "")

leveneTest(pellpct ~ label, data=data)
oneway.test(pellpct ~ label, data=data, var.equal=FALSE)

pairwise.t.test(data$pellpct, data$label, p.adjust.method="BH", pool.sd=FALSE)

aggregate(data$pellpct, list(data$label), FUN=mean, na.rm=TRUE, na.action=na.pass)

#Average Test Scores
ggline(data, x = "label", y = "testscore", 
       add = c("mean_ci", "jitter"), 
       color="black",
       point.color="red",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Average Test Score", xlab = "")

leveneTest(testscore ~ label, data=data)
oneway.test(testscore ~ label, data=data, var.equal=FALSE)

pairwise.t.test(data$testscore, data$label, p.adjust.method="BH", pool.sd=FALSE)

aggregate(data$testscore, list(data$label), FUN=mean, na.rm=TRUE, na.action=na.pass)

##Selectivity
#Admissions Rate

ggline(data, x = "label", y = "selectivity", 
       add = c("mean_ci", "jitter"), 
       color="black",
       point.color="red",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Tdatation", xlab = "")

leveneTest(selectivity ~ label, data=data)
oneway.test(selectivity ~ label, data=data, var.equal=FALSE)

pairwise.t.test(data$selectivity, data$label, p.adjust.method="BH", pool.sd=FALSE)

aggregate(data$selectivity, list(data$label), FUN=mean, na.rm=TRUE, na.action=na.pass)

##Graduation Rate
ggline(data, x = "label", y = "total_gradrate", 
       add = c("mean_ci", "jitter"), 
       color="black",
       point.color="red",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Graduation Rate", xlab = "")

leveneTest(total_gradrate ~ label, data=data)
oneway.test(total_gradrate ~ label, data=data, var.equal=FALSE)

pairwise.t.test(data$total_gradrate, data$label, p.adjust.method="BH", pool.sd=FALSE)

aggregate(data$total_gradrate, list(data$label), FUN=mean, na.rm=TRUE, na.action=na.pass)

## Figure 2 

png("figure2.png", height=11.5, width=8, units="in", res=600)
a <- ggline(data, x = "label", y = "blackpct", 
       add = c("mean_ci", "jitter"), 
       color="gray",
       point.color="black",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Percent African American", xlab = "(a)")

b <- ggline(data, x = "label", y = "hispanicpct", 
            add = c("mean_ci", "jitter"), 
            color="gray",
            point.color="black",
            point.size=2,
            order = c("Non-Adopter", "Pre-COVID", "COVID"),
            ylab = "Percent Hispanic", xlab = "(b)")

c <- ggline(data, x = "label", y = "tuition_real", 
            add = c("mean_ci", "jitter"), 
            color="gray",
            point.color="black",
            point.size=2,
            order = c("Non-Adopter", "Pre-COVID", "COVID"),
            ylab = "Tuition", xlab = "(c)")

d <- ggline(data, x = "label", y = "pellpct", 
            add = c("mean_ci", "jitter"), 
            color="gray",
            point.color="black",
            point.size=2,
            order = c("Non-Adopter", "Pre-COVID", "COVID"),
            ylab = "Pell Percentage", xlab = "(d)")

e <- ggline(data, x = "label", y = "testscore", 
            add = c("mean_ci", "jitter"), 
            color="gray",
            point.color="black",
            point.size=2,
            order = c("Non-Adopter", "Pre-COVID", "COVID"),
            ylab = "Average Test Scores", xlab = "(e)")

f <- ggline(data, x = "label", y = "selectivity", 
       add = c("mean_ci", "jitter"), 
       color="gray",
       point.color="black",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Admissions Rate", xlab = "(f)")

g <- ggline(data, x = "label", y = "total_gradrate", 
       add = c("mean_ci", "jitter"), 
       color="gray",
       point.color="black",
       point.size=2,
       order = c("Non-Adopter", "Pre-COVID", "COVID"),
       ylab = "Graduation Rate", xlab = "(g)")

grid.arrange(a,b,c,d,e,f,g, ncol=2)
dev.off()

## Figure 3
png("figure3.png", height=6, width=6, units="in", res=600)
plot(survfit(Surv(int1, int2, type="interval2")~1, data=data), xlab="Years since 2003", ylab="Overall Survival Probability", ylim=c(0,1))
dev.off()

## Table 2

## Survival Model with all institutions

#Model 1 <- All Data

m1 <- WeibullReg(Surv(int1, int2, type="interval2") ~ inst_control + tuition_real + selectivity + number_enrolled_ft + blackpct + hispanicpct + pellpct + total_gradrate, data=data)
m1

#Model 2 <- 100 enrollment
m2 <- WeibullReg(Surv(int1, int2, type="interval2") ~ inst_control + tuition_real + selectivity + number_enrolled_ft + blackpct + hispanicpct + pellpct + total_gradrate, data=data[which(data$number_enrolled_ft>1),])
m2

#Model 3 <- Add composite test scores
m3 <- WeibullReg(Surv(int1, int2, type="interval2") ~ inst_control + tuition_real + selectivity + number_enrolled_ft + blackpct + hispanicpct + pellpct + total_gradrate + testscore, data=data)
m3

## Logit model for COVID adoptions
#Model 4
m4 <- glm(covid ~ inst_control + tuition_real + selectivity + number_enrolled_ft + blackpct + hispanicpct + pellpct + total_gradrate + testscore, family=binomial(link="logit"), data=data[which(data$right_censor==0),])
summary(m4)
nobs(m4)

X <- summary(m4)$coef

or <- as.data.frame(X[,1:2])
or$odds <- exp(or[,1])
or$odds.lowerci <- exp(or[,1] - (1.96*or[,2]))
or$odds.upperci <- exp(or[,1] + (1.96*or[,2]))
or

## Temporary Adoptions
table(data$adopt, data$covid, data$temp_year)
mean(data$tuition_real[data$permenant==0 & data$adopt==1]) #Temporary adopters
mean(data$tuition_real[data$permenant==1 & data$adopt==1 & data$covid==1]) #Permanent COVID adopters
mean(data$tuition_real[data$covid==0 & data$adopt==1]) #Pre-COVID adopters
